Both counts have the same genes analysed. These amount to nearly 54,000 genes, which corresponds to the whole genome of mice.
gene_biotypes <- table(gastroc$gene_biotype) %>%
as.data.frame() %>%
arrange(desc(Freq))
treemap::treemap(gene_biotypes,
index = "Var1",
vSize = "Freq",
title = "Gene biotypes")
For a broad overview all genes with rowSums() < 10
will be displayed. This is usually similar to omitting genes with zero
counts of more than 30%
rs.gastroc <- rowSums(counts.gastroc) %>%
as.data.frame() %>%
dplyr::rename("rowSums" = ".")
n.omit.gastroc <- sum(rs.gastroc < 10)
title <- paste0("rowSums low expressed genes, gastroc \n",
n.omit.gastroc, " genes will be omitted (~",
floor(n.omit.gastroc / nrow(rs.gastroc) * 100), " %)")
p.gastroc <- filter(rs.gastroc, rowSums < 20) %>%
ggplot(., aes(x=rowSums))+
geom_histogram(color="black", fill="white")+
geom_vline(aes(xintercept=10, color="red"),
linetype="dashed") +
ggtitle(title) +
theme(legend.position = "none")
# ---
rs.soleus <- rowSums(counts.soleus) %>%
as.data.frame() %>%
dplyr::rename("rowSums" = ".")
n.omit.soleus <- sum(rs.soleus < 10)
title <- paste0("rowSums low expressed genes, soleus \n",
n.omit.soleus, " genes will be omitted (~",
floor(n.omit.gastroc / nrow(rs.gastroc) * 100), " %)")
p.soleus <- filter(rs.soleus, rowSums < 20) %>%
ggplot(., aes(x=rowSums))+
geom_histogram(color="black", fill="white")+
geom_vline(aes(xintercept=10, color="red"),
linetype="dashed") +
ggtitle(title)+
theme(legend.position = "none")
p.gastroc
p.soleus
omitting genes with zero counts in more than 30% of the samples
plot.library_size <- function(counts, title="") {
cs <- colSums(counts) %>%
as.data.frame() %>%
dplyr::rename("colsums" = ".") %>%
tibble::rownames_to_column(var = "sample")
ggplot(cs, aes(x = sample, y = colsums)) +
geom_bar(stat = "identity") +
theme(axis.text.x=element_text(color = "black", angle=45, vjust=.8, hjust=0.8)) +
ggtitle(paste0("total counts, ", title))
}
plt.libsize.gastroc <- plot.library_size(counts.gastroc, title = "gastroc") +
ylim(0, 4e07)
plt.libsize.soleus <- plot.library_size(counts.soleus, title = "soleus") +
ylim(0, 4e07)
ggpubr::ggarrange(plt.libsize.gastroc, plt.libsize.soleus,
ncol = 1, nrow = 2)
not sure what to make of this, but its there.
Preparing the sample by applying log2 and loading metadata files
Resulting Plots:
plotBoxLog <- function(logcounts, metadata) {
stack(logcounts) %>%
merge(metadata, by.x = "ind", by.y = "sample_name") %>%
ggplot( aes(x=ind, y=values, color=genotype)) +
geom_boxplot() +
ylab("Log2(Counts)") +
xlab(element_blank()) +
theme(axis.text.x=element_text(angle=45, vjust=.8, hjust=0.8)) +
geom_hline(yintercept = median(as.matrix(logcounts.gastroc)), col="blue" )
}
p1 <- plotBoxLog(logcounts.gastroc, metadata.gastroc) +
ggtitle("gastroc")
p2 <- plotBoxLog(logcounts.soleus, metadata.soleus) +
ggtitle("soleus") +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank()) +
rremove("ylab")
Just for exploration purpose, but the resulting data is not used in the following analysis.
p1 <- DESeq2::rlog(as.matrix(counts.gastroc)) %>%
as.data.frame() %>%
plotBoxLog(metadata.gastroc) +
ggtitle("gastroc") +
ylab("rlog(counts)")
p2 <- DESeq2::rlog(as.matrix(counts.soleus)) %>%
as.data.frame() %>%
plotBoxLog(metadata.soleus) +
ggtitle("soleus") +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank()) +
rremove("ylab")
To obtain the most variable genes, a z-score will be applied to then take the sd and filter for the top 1000
\(\frac{x - mean(x)}{sd(x)}\)
mostVariableRows <- function(M, ntop=1000) {
s <- apply( M,1,sd ) # standard deviation
µ <- apply( M, 1, mean ) # mean
M.z <- (M - µ) / s # zscore
# sd ordered descending
sdev <- apply(counts.gastroc, 1, sd)
M.top <- M[order(sdev, decreasing = T)[1:ntop] , ]
return(M.top)
}
counts.gastroc.top <- mostVariableRows(counts.gastroc)
counts.soleus.top <- mostVariableRows(counts.soleus)
To visualize the top 1000 most variable genes, I tried to generate
some Heatmaps. (This time not using DESeq2::rlog, but just
log2(counts). Using the raw counts I could try to just
apply log2 to all counts and then create a heatmap.
ComplexHeatmapThe following Heatmaps are done with ComplexHeatmap but
are very bad looking.
# Get some nicer colours
mypalette <- brewer.pal(11, "RdYlBu")
# http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
morecols <- colorRampPalette(mypalette)
chm.gastroc <- Heatmap(
log2(counts.gastroc.top + 1) %>% as.matrix(),
row_title = "genes",
name = "log2",
show_row_names = FALSE,
column_title = "samples",
col = rev(morecols(50)),
column_title_side = "bottom"
# row_dend_side = "right",
)
# saving the CHM as png, to adjust the resolution (I don't know another method)
png(
"./plots/01_CHM_top_variable_genes.gastroc.png",
width = 10,
height = 7.5,
res = 150,
units = "in"
)
draw(chm.gastroc)
invisible(dev.off())
chm.soleus <- Heatmap(
log2(counts.soleus.top + 1) %>% as.matrix(),
row_title = "genes",
name = "log2",
show_row_names = FALSE,
column_title = "samples",
col = rev(morecols(50)),
column_title_side = "bottom"
# row_dend_side = "right",
)
# saving the CHM as png, to adjust the resolution (I don't know another method)
png(
"./plots/01_CHM_top_variable_genes.soleus.png",
width = 10,
height = 7.5,
res = 150,
units = "in"
)
draw(chm.soleus)
invisible(dev.off())
gplots::heatmap.2With this package, the heatmaps are seemingly generated differently,
even though I thought both packages use hclust(). Not sure
yet what is the difference.
# Set up colour vector for celltype variable
col.cell <- rep(c("#00C3C6"), 12)
col.cell[metadata.gastroc$genotype == "KO"] = "#FF6C67"
heatmap.2(log2(as.matrix(counts.gastroc.top + 1)),
labRow = FALSE,
col=rev(morecols(50)),
ColSideColors=col.cell, scale="row",
trace="column",
main="gastroc",
lmat=rbind(c(5,4), c(3,2), c(0,1)),
lhei=c(2.2,4,0.2))
heatmap.2(log2(as.matrix(counts.soleus.top + 1)),
labRow = FALSE,
col=rev(morecols(50)),
ColSideColors=col.cell, scale="row",
trace="column",
main="soleus",
lmat=rbind(c(5,4), c(3,2), c(0,1)),
lhei=c(2.2,4,0.2))
the pca is performed on the raw data (not z-scored) => Thus using
the scale.=T argument?
pca <- prcomp(t(counts.gastroc.top), scale. = T)
pca.data <- data.frame(Sample = rownames(pca$x),
X = pca$x[, 1],
Y = pca$x[, 2]) %>%
mutate(type = substr(Sample, 1,2))
plt <-
autoplot(
pca,
data = pca.data,
colour = 'type',
label.show.legend = FALSE
) +
ggtitle("PCA gastroc")
# ggsave(filename = "./plots/04_pca_gastroc_filtered.png", plt, dpi=300)
plt
createDDSObject <- function(counts, metadata) {
# select sample columns
reorder_index <- match(rownames(metadata), colnames(counts))
counts <- counts[,reorder_index]
# Check metadata consistency
all(rownames(metadata) %in% colnames(counts)) %>%
assertthat::assert_that(., msg = "metadata and count table do not match")
## DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design = ~ genotype)
return( DESeq(dds) )
}
# creating DESeq Objects
dds.gastroc <- createDDSObject(counts.gastroc, metadata.gastroc)
dds.soleus <- createDDSObject(counts.soleus, metadata.soleus)
plotMA(dds.gastroc)
plotMA(dds.soleus)
plotDispEsts(dds.gastroc)
plotDispEsts(dds.soleus)
head(res.gastroc)
log2 fold change (MLE): genotype WT vs KO
Wald test p-value: genotype WT vs KO
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000057003 1263134 1.528571 0.188761 8.09790 5.59144e-16 1.11629e-14
ENSMUSG00000064351 1072094 0.464631 0.154018 3.01674 2.55513e-03 7.20561e-03
ENSMUSG00000031972 901274 0.915253 0.102929 8.89212 5.99562e-19 1.64640e-17
ENSMUSG00000061723 723692 1.407953 0.123315 11.41751 3.41876e-30 2.64747e-28
ENSMUSG00000032366 645951 1.635209 0.155699 10.50238 8.42310e-26 4.55336e-24
ENSMUSG00000030695 606244 0.605202 0.114231 5.29807 1.17031e-07 7.66958e-07
head(res.soleus)
log2 fold change (MLE): genotype WT vs KO
Wald test p-value: genotype WT vs KO
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000051951 0.877455 0.4953332 1.516260 0.326681 0.7439092 NA
ENSMUSG00000025900 801.345381 0.3712948 0.160227 2.317311 0.0204868 0.124199
ENSMUSG00000025902 875.547200 -0.0583178 0.198690 -0.293512 0.7691312 0.903197
ENSMUSG00000104238 1.556245 0.7166751 1.103738 0.649316 0.5161341 NA
ENSMUSG00000102269 11.634924 0.6967718 0.450394 1.547027 0.1218568 0.361899
ENSMUSG00000096126 1.623686 -1.0224528 0.965597 -1.058881 0.2896538 NA
topGenes.gastroc <- as.data.frame(res.gastroc) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
topGenes.soleus <- as.data.frame(res.soleus) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
knitr::kable(head(topGenes.gastroc), caption = "gastroc")
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000050335 | 4934.2949 | -6.267396 | 0.1664065 | -37.66316 | 0 | 0 |
| ENSMUSG00000081249 | 1378.5460 | -4.208222 | 0.1115955 | -37.70959 | 0 | 0 |
| ENSMUSG00000010751 | 1864.4937 | -5.610357 | 0.2014267 | -27.85309 | 0 | 0 |
| ENSMUSG00000032712 | 5136.5059 | -4.686418 | 0.1683204 | -27.84225 | 0 | 0 |
| ENSMUSG00000034855 | 554.3475 | -7.276611 | 0.2735037 | -26.60517 | 0 | 0 |
| ENSMUSG00000037613 | 960.7558 | -3.183338 | 0.1266510 | -25.13472 | 0 | 0 |
knitr::kable(head(topGenes.soleus), caption = "soleus")
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000038615 | 32047.185 | 2.5454285 | 0.0636829 | 39.97038 | 0 | 0 |
| ENSMUSG00000081249 | 1651.592 | -4.5759032 | 0.1292779 | -35.39587 | 0 | 0 |
| ENSMUSG00000039067 | 4355.557 | 1.1049116 | 0.0419786 | 26.32084 | 0 | 0 |
| ENSMUSG00000028932 | 3190.657 | 1.1102258 | 0.0558536 | 19.87741 | 0 | 0 |
| ENSMUSG00000006998 | 7402.617 | 0.9196943 | 0.0468631 | 19.62511 | 0 | 0 |
| ENSMUSG00000102869 | 6985.622 | 0.9181929 | 0.0479924 | 19.13203 | 0 | 0 |
hist(res.gastroc$pvalue, main = "gastroc")
hist(res.soleus$pvalue, main = "soleus")
Is shrinking necessary?
[] watch the course video
resTab.gastroc <- as.data.frame(res.gastroc) %>%
tibble::rownames_to_column("GeneID") %>%
# left_join(ensemblAnnot, by = "GeneID") %>%
dplyr::rename(logFC = log2FoldChange, FDR = padj) %>%
mutate(`-log10(pvalue)` = -log10(pvalue))
resTab.soleus <- as.data.frame(res.soleus) %>%
tibble::rownames_to_column("GeneID") %>%
# left_join(ensemblAnnot, by = "GeneID") %>%
dplyr::rename(logFC = log2FoldChange, FDR = padj) %>%
mutate(`-log10(pvalue)` = -log10(pvalue))
ggplot(resTab.gastroc, aes(x = logFC, y = `-log10(pvalue)`)) +
geom_point(aes(colour = FDR < 0.05), size = 1) +
geom_text(data = ~top_n(.x, 1, wt=-FDR), aes(label = GeneID))
ggplot(resTab.soleus, aes(x = logFC, y = `-log10(pvalue)`)) +
geom_point(aes(colour = FDR < 0.05), size = 1) +
geom_text(data = ~top_n(.x, 1, wt=-FDR), aes(label = GeneID))
saveRDS(counts.gastroc, file = "./data/Robjects/counts.gastroc.rds")
saveRDS(counts.soleus, file = "./data/Robjects/counts.soleus.rds")
save(dds.gastroc, dds.soleus, file = "./data/Robjects/DDS.RData")
Most variable genes:
PCA:
scale=T argument?DESeq:
lfcShrink)